Fan Wang
June 10th 2022
The Cancer Cell Line Encyclopedia (CCLE) project is an effort to conduct a detailed genetic characterization of a large panel of human cancer cell lines. The CCLE provides public access to analysis and visualization of DNA copy number, mRNA expression, mutation data and more, for more than 1000 cancer cell lines. This notebook demonstrates how to visualize the gene expression for cell lines of interest. CCLE gene expression data was pulled from BRH, formatted using Pandas, and then visualized using Seaborn.
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import seaborn as sns
# Apply seaborn's "ticks" style and enable its shorthand color codes.
sns.set(style="ticks", color_codes=True)
# Ask the IPython inline backend to render figures as SVG.
get_ipython().run_line_magic("config", "InlineBackend.figure_format = 'svg'")
import matplotlib.pyplot as plt
# NOTE(review): seaborn is already imported above; this repeat is harmless.
import seaborn as sns
import zipfile
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")
# Install the gen3 client from inside the notebook (-U upgrades an existing install).
!pip install gen3 -U
# Pull the DRS object by its GUID. --no-unpack-packages leaves the download
# packed; presumably this is the CCLE_data_22Q2.zip archive that is extracted
# explicitly in the next step -- confirm against the data portal.
!gen3 drs-pull object dg.OADC/41c3f1ac-2cc7-4b04-b09c-a9c5dbad2c98 --no-unpack-packages
# Unpack the CCLE archive into the current working directory.
path_to_zip_file_expression = "CCLE_data_22Q2.zip"
# The context manager closes the archive even if extraction raises; the
# extractall() call must be indented so that it runs inside the `with` block.
with zipfile.ZipFile(path_to_zip_file_expression, "r") as zip_ref:
    zip_ref.extractall()
# Load the cell-line annotations and keep only the rows whose lineage is "lung".
sample_info = pd.read_csv("CCLE_data_22Q2/sample_info_22Q2.csv")
is_lung = sample_info["lineage"] == "lung"
lung = sample_info.loc[is_lung]
# Load the expression table. Each header is cut down to its first
# space-separated token, and the header that ends up as "Unnamed:" (the
# unnamed leading column of the CSV) is renamed to "Cell Line".
expression = pd.read_csv("CCLE_data_22Q2/CCLE_expression_22Q2.csv", sep=",")
expression.columns = [
    "Cell Line" if token == "Unnamed:" else token
    for token in (header.split(" ")[0] for header in expression.columns)
]
# Attach expression values to the lung cell lines by joining the annotation's
# DepMap_ID against the expression table's "Cell Line" column.
# Columns to discard afterwards: every annotation column except the cell-line
# name, plus the "Cell Line" join key.
removecolumnlist = [*lung.columns, "Cell Line"]
removecolumnlist.remove("stripped_cell_line_name")
# Merge, drop the unneeded columns, and index the rows by cell-line name.
lungexpressAll = (
    lung.merge(expression, left_on="DepMap_ID", right_on="Cell Line", how="inner")
    .drop(columns=removecolumnlist)
    .set_index("stripped_cell_line_name")
)
# Transpose so that genes become rows and cell lines become columns.
LE = lungexpressAll.T
# Rank the genes by their variance across cell lines and keep the 2000 most
# variable ones (the highly variable genes, HVGs).
gene_variance = LE.var(axis=1)
Top2000 = gene_variance.sort_values(ascending=False).head(2000)
val = LE.loc[Top2000.index]
# Pairwise correlation between cell lines (columns) over the selected genes.
correlation = val.corr()
correlation
# Draw the cell-line correlation matrix as a heatmap on a 15 x 15 inch figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation, ax=ax)
# Repeat the analysis for the lung cell lines whose lineage_subtype is "NSCLC".
# The variables are named after the subtype they actually hold; the SCLC-named
# variables are defined by the SCLC analysis that follows.
NSCLC = lung[lung["lineage_subtype"] == "NSCLC"]
NSCLCexpressAll = pd.merge(
    NSCLC, expression, left_on="DepMap_ID", right_on="Cell Line", how="inner"
)
# Drop every annotation column except the cell-line name, plus the join key.
removecolumnlist = NSCLC.columns.to_list()
removecolumnlist.remove("stripped_cell_line_name")
removecolumnlist.append("Cell Line")
NSCLCexpressAll.drop(removecolumnlist, axis=1, inplace=True)
NSCLCexpressAll.set_index("stripped_cell_line_name", inplace=True)
# Genes as rows, cell lines as columns.
NSCLC_expression = NSCLCexpressAll.transpose()
# Keep the 2000 genes with the largest variance across the NSCLC cell lines.
Top2000 = NSCLC_expression.var(axis=1).sort_values(ascending=False)[0:2000]
val = NSCLC_expression.loc[Top2000.index]
# Correlation between cell lines over those genes, shown as a heatmap.
correlation = val.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlation)
# Restrict the lung annotations to the rows whose lineage_subtype is "SCLC".
SCLC = lung.loc[lung["lineage_subtype"] == "SCLC"]
# Columns to discard after the join: every annotation column except the
# cell-line name, plus the "Cell Line" join key.
removecolumnlist = [*SCLC.columns, "Cell Line"]
removecolumnlist.remove("stripped_cell_line_name")
# Join the expression values, drop the extra columns, index by cell-line name.
SCLCexpressAll = (
    SCLC.merge(expression, left_on="DepMap_ID", right_on="Cell Line", how="inner")
    .drop(columns=removecolumnlist)
    .set_index("stripped_cell_line_name")
)
# Genes as rows, cell lines as columns.
SCLC_expression = SCLCexpressAll.T
# Keep the 2000 genes with the largest variance across the SCLC cell lines.
Top2000 = SCLC_expression.var(axis=1).sort_values(ascending=False).head(2000)
val = SCLC_expression.loc[Top2000.index]
# Correlation between cell lines over those genes, shown as a heatmap.
correlation = val.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlation, ax=ax)
# Cluster the selected genes (rows) and cell lines (columns) and draw the
# result. clustermap is a figure-level function that builds its own figure
# from `figsize`, so no plt.figure() call is made beforehand: it would only
# leave an empty extra figure in the output.
g = sns.clustermap(val, figsize=(13, 13))
# Two cell lines that the accompanying text describes as neighbours in the
# clustermap.
near_pair = ["SW1271", "NCIH2286"]
twoset = val[near_pair]
twoset
# Scatter plot with a fitted regression line, one point per gene.
sns.lmplot(data=twoset, x=near_pair[0], y=near_pair[1])
# Correlation coefficient between the two expression profiles.
twoset[near_pair[0]].corr(twoset[near_pair[1]])
The correlation coefficient is 0.75, which indicates a fairly high correlation. Cell lines that sit close together in the clustermap express the same genes in a similar manner.
# Two cell lines that the accompanying text describes as far apart in the
# clustermap.
far_pair = ["SW1271", "NCIH2171"]
otherset = val[far_pair]
# Scatter plot with a fitted regression line, one point per gene.
sns.lmplot(data=otherset, x=far_pair[0], y=far_pair[1])
# Correlation coefficient between the two expression profiles.
otherset[far_pair[0]].corr(otherset[far_pair[1]])
As expected, the correlation is low between two cell lines that are far apart in the clustermap.
# Scatter plot for every pairing of the four chosen cell lines.
cell_lines = ["SW1271", "NCIH2286", "NCIH2171", "NCIH196"]
sets = val[cell_lines]
g = sns.PairGrid(data=sets)
g.map(sns.scatterplot)
# Same four cell lines, now with histograms on the diagonal and scatter plots
# in the off-diagonal panels.
cell_lines = ["SW1271", "NCIH2286", "NCIH2171", "NCIH196"]
sets = val[cell_lines]
g = sns.PairGrid(data=sets)
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
# Corner layout: scatter plots in the lower-triangle panels and density
# curves on the diagonal, with the diagonal panels not sharing a y-axis.
g = sns.PairGrid(data=sets, corner=True, diag_sharey=False)
g.map_lower(sns.scatterplot)
g.map_diag(sns.kdeplot)
We retrieved the CCLE expression dataset from BRH. We then focused on the 2000 genes with the largest variation and plotted a clustermap for these highly variable genes (HVGs). From the clustermap, nearby and distant cell lines were selected and their correlation was visualized. Finally, we plotted the correlations for multiple cell lines as scatter plots using PairGrid.